library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## v readr 2.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(NbClust)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(cluster)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
df=read.csv("E:\\GRAD\\fda\\Project1\\World Indicators.csv",na.strings = "")
df$Energy.Usage=NULL
df$Lending.Interest=NULL
colSums(is.na(df))
## ï..Birth.Rate Business.Tax.Rate Days.to.Start.Business
## 9 27 27
## GDP Health.Exp...GDP Health.Exp.Capita
## 20 23 23
## Hours.to.do.Tax Infant.Mortality.Rate Internet.Usage
## 28 20 9
## Life.Expectancy.Female Life.Expectancy.Male Mobile.Phone.Usage
## 11 11 12
## Population.0.14 Population.15.64 Population.65.
## 17 17 17
## Population.Urban Region Country
## 2 0 0
library(ggplot2)
#Outkier percentage return function
perc_out=function(a){
length(which(a> mean(a) + 3 * sd(a) | a < mean(a) - 3 * sd(a)) ) / length(a)
}
names(df)
## [1] "ï..Birth.Rate" "Business.Tax.Rate" "Days.to.Start.Business"
## [4] "GDP" "Health.Exp...GDP" "Health.Exp.Capita"
## [7] "Hours.to.do.Tax" "Infant.Mortality.Rate" "Internet.Usage"
## [10] "Life.Expectancy.Female" "Life.Expectancy.Male" "Mobile.Phone.Usage"
## [13] "Population.0.14" "Population.15.64" "Population.65."
## [16] "Population.Urban" "Region" "Country"
#Cleaning Birth rate
boxplot(df$ï..Birth.Rate)
df$ï..Birth.Rate[is.na(df$ï..Birth.Rate)]=mean(df$ï..Birth.Rate,trim=2*perc_out(na.omit(df$ï..Birth.Rate)),na.rm = TRUE)
ggplot(df, aes(x=ï..Birth.Rate)) + geom_density()
df$ï..Birth.Rate=df$ï..Birth.Rate**0.25
#cleaning Business.Tax.Rate
df$Business.Tax.Rate=gsub(pattern="%",replacement="",df$Business.Tax.Rate)
df$Business.Tax.Rate=as.numeric(df$Business.Tax.Rate)
boxplot(df$Business.Tax.Rate)
df$Business.Tax.Rate[is.na(df$Business.Tax.Rate)]=mean(df$Business.Tax.Rate,trim=2*perc_out(na.omit(df$Business.Tax.Rate)),na.rm = TRUE)
ggplot(df, aes(x=Business.Tax.Rate)) + geom_density()
df$Business.Tax.Rate=df$Business.Tax.Rate**0.2
#cleaning Days.to.Start.Business
boxplot(df$Days.to.Start.Business)
df$Days.to.Start.Business[is.na(df$Days.to.Start.Business)]=mean(df$Days.to.Start.Business,trim=2*perc_out(na.omit(df$Days.to.Start.Business)),na.rm = TRUE)
ggplot(df, aes(x=Days.to.Start.Business)) + geom_density()
df$Days.to.Start.Business=df$Days.to.Start.Business**0.2
#Cleaning GDP
df$GDP=gsub(pattern="[[:punct:]]",replacement="",df$GDP)
df$GDP=as.numeric(df$GDP)
boxplot(df$GDP)
df$GDP[is.na(df$GDP)]=mean(df$GDP,trim=2*perc_out(na.omit(df$GDP)),na.rm = TRUE)
ggplot(df, aes(x=GDP)) + geom_density()
df$GDP=df$GDP**0.1
#cleaning Health.Exp...GDP
boxplot(df$Health.Exp...GDP)
df$Health.Exp...GDP[is.na(df$Health.Exp...GDP)]=mean(df$Health.Exp...GDP,trim=2*perc_out(na.omit(df$Health.Exp...GDP)),na.rm = TRUE)
ggplot(df, aes(x=Health.Exp...GDP)) + geom_density()
df$Health.Exp...GDP=df$Health.Exp...GDP**0.8
#Cleaning Health.Exp.Capita
df$Health.Exp.Capita=gsub(pattern="[[:punct:]]",replacement="",df$Health.Exp.Capita)
df$Health.Exp.Capita=as.numeric(df$Health.Exp.Capita)
boxplot(df$Health.Exp.Capita)
df$Health.Exp.Capita[is.na(df$Health.Exp.Capita)]=mean(df$Health.Exp.Capita,trim=2*perc_out(na.omit(df$Health.Exp.Capita)),na.rm = TRUE)
ggplot(df, aes(x=Health.Exp.Capita)) + geom_density()
df$Health.Exp.Capita=df$Health.Exp.Capita**0.2
#Cleaning Hours.to.do.Tax
boxplot(df$Hours.to.do.Tax)
df$Hours.to.do.Tax[is.na(df$Hours.to.do.Tax)]=mean(df$Hours.to.do.Tax,trim=2*perc_out(na.omit(df$Hours.to.do.Tax)),na.rm = TRUE)
ggplot(df, aes(x=Hours.to.do.Tax)) + geom_density()
df$Hours.to.do.Tax=df$Hours.to.do.Tax**0.2
boxplot(df$Infant.Mortality.Rate)
df$Infant.Mortality.Rate[is.na(df$Infant.Mortality.Rate)]=mean(df$Infant.Mortality.Rate,trim=2*perc_out(na.omit(df$Infant.Mortality.Rate)),na.rm = TRUE)
ggplot(df, aes(x=Infant.Mortality.Rate)) + geom_density()
df$Infant.Mortality.Rate=df$Infant.Mortality.Rate**0.33
boxplot(df$Internet.Usage)
df$Internet.Usage[is.na(df$Internet.Usage)]=mean(df$Internet.Usage,trim=2*perc_out(na.omit(df$Internet.Usage)),na.rm = TRUE)
ggplot(df, aes(x=Internet.Usage)) + geom_density()
boxplot(df$Life.Expectancy.Female)
df$Life.Expectancy.Female[is.na(df$Life.Expectancy.Female)]=mean(df$Life.Expectancy.Female,trim=2*perc_out(na.omit(df$Life.Expectancy.Female)),na.rm = TRUE)
ggplot(df, aes(x=Life.Expectancy.Female)) + geom_density()
df$Life.Expectancy.Female=df$Life.Expectancy.Female**3
df$Life.Expectancy.Female=df$Life.Expectancy.Female**2
boxplot(df$Life.Expectancy.Male)
df$Life.Expectancy.Male[is.na(df$Life.Expectancy.Male)]=mean(df$Life.Expectancy.Male,trim=2*perc_out(na.omit(df$Life.Expectancy.Male)),na.rm = TRUE)
ggplot(df, aes(x=Life.Expectancy.Male)) + geom_density()
df$Life.Expectancy.Male=df$Life.Expectancy.Male**2
boxplot(df$Mobile.Phone.Usage)
df$Mobile.Phone.Usage[is.na(df$Mobile.Phone.Usage)]=mean(df$Mobile.Phone.Usage,trim=2*perc_out(na.omit(df$Mobile.Phone.Usage)),na.rm = TRUE)
ggplot(df, aes(x=Mobile.Phone.Usage)) + geom_density()
boxplot(df$Population.0.14)
df$Population.0.14[is.na(df$Population.0.14)]=mean(df$Population.0.14,trim=2*perc_out(na.omit(df$Population.0.14)),na.rm = TRUE)
ggplot(df, aes(x=Population.0.14)) + geom_density()
boxplot(df$Population.15.64)
df$Population.15.64[is.na(df$Population.15.64)]=mean(df$Population.15.64,trim=2*perc_out(na.omit(df$Population.15.64)),na.rm = TRUE)
ggplot(df, aes(x=Population.15.64)) + geom_density()
boxplot(df$Population.65.)
df$Population.65.[is.na(df$Population.65.)]=mean(df$Population.65.,trim=2*perc_out(na.omit(df$Population.65.)),na.rm = TRUE)
ggplot(df, aes(x=Population.65.)) + geom_density()
df$Population.65.=df$Population.65.**0.33
boxplot(df$Population.Urban)
df$Population.Urban[is.na(df$Population.Urban)]=mean(df$Population.Urban,trim=2*perc_out(na.omit(df$Population.Urban)),na.rm = TRUE)
ggplot(df, aes(x=Population.Urban)) + geom_density()
x=data.frame(scale(df[,1:16]))
y=df[,18]
#Finding optimal number of clusters
fviz_nbclust(x,kmeans,method="wss")
fviz_nbclust(df[,1:3],kmeans,method="silhouette")
#apply Kmeans
km=kmeans(x,2,nstart=20)
df$class_k=km$cluster
#Apply heirarchical clustering
hc.single=hclust(dist(x),method="centroid")
plot(hc.single)
df$class_h=cutree(hc.single,7)
sil_k <- silhouette(df$class_k, dist(x))
plot(sil_k, main ="Silhouette plot - K-means")
sil_h<- silhouette(df$class_h, dist(x))
plot(sil_h, main ="Silhouette plot - hierarchical-means")
#Filling summary
compare=data.frame(matrix(ncol = 3, nrow = 0))
names(compare)=c("Parameter","Kmeans","hierarchical")
compare=rbind(compare,c("Silhoutte",mean(sil_k[,3]),mean(sil_h[,3])))
#Find Dunn index to make sure they have been clustered well
library(fpc)
k_stats <- cluster.stats(dist(x), df$class_k)
h_stats <- cluster.stats(dist(x), df$class_h)
compare=rbind(compare,c("Dunn index",k_stats$dunn,h_stats$dunn))
names(compare)=c("Parameter","Kmeans","hierarchical")
print(compare)
## Parameter Kmeans hierarchical
## 1 Silhoutte 0.335333571169999 -0.0297076394297334
## 2 Dunn index 0.154051727097344 0.133924939621907
#It is clear that Kmeans performed well tha Heirarchical method since silhoutte index and dunn index is more than the other
#Comparing Health expense capita and GDP
plot_ly(df, x = ~Health.Exp.Capita, y = ~GDP, color = ~as.factor(class_k))
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#From the graph it is clear that countries lying on 1st and 4th quarants are considered as a group
#Comparing Infant.Mortality.Rate and GDP
plot_ly(df, x = ~Infant.Mortality.Rate, y = ~GDP, color = ~as.factor(class_k))
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#From the graph it is clear that countries lying on 1st and 4th quarants are considered as a group
#Comparing Health.Exp.Capita ,ï..Birth.Rate,Health.Exp...GDP
plot_ly(df, x = ~Health.Exp...GDP, y = ~Health.Exp.Capita, z=df$ï..Birth.Rate, color = ~as.factor(class_k))
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#From the graph it is clear that countries lying on 1st and 4th quarants are considered as a group
names(df)
## [1] "ï..Birth.Rate" "Business.Tax.Rate" "Days.to.Start.Business"
## [4] "GDP" "Health.Exp...GDP" "Health.Exp.Capita"
## [7] "Hours.to.do.Tax" "Infant.Mortality.Rate" "Internet.Usage"
## [10] "Life.Expectancy.Female" "Life.Expectancy.Male" "Mobile.Phone.Usage"
## [13] "Population.0.14" "Population.15.64" "Population.65."
## [16] "Population.Urban" "Region" "Country"
## [19] "class_k" "class_h"
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.